Group Project

Import Packages

library("ggplot2")
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library('geosphere')
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.

Importing Data

# Reading in the sample CSV of rider data we made
rider_2019_sample <- read.csv('sample.csv', stringsAsFactors = TRUE)
head(rider_2019_sample)
##   tripduration                starttime                 stoptime
## 1          564 2019-04-11 07:44:36.0580 2019-04-11 07:54:00.1840
## 2         1158 2019-05-15 18:00:33.3890 2019-05-15 18:19:52.0150
## 3          763 2019-03-25 13:27:50.4260 2019-03-25 13:40:33.7960
## 4          915 2019-06-21 15:52:07.8340 2019-06-21 16:07:23.6810
## 5         1368 2019-06-01 18:42:22.9500 2019-06-01 19:05:11.7510
## 6          267 2019-07-31 18:47:05.5630 2019-07-31 18:51:33.0870
##   start.station.id       start.station.name start.station.latitude
## 1             3711       E 13 St & Avenue A               40.72967
## 2             3016        Kent Ave & N 7 St               40.72037
## 3              382  University Pl & E 14 St               40.73493
## 4              359       E 47 St & Park Ave               40.75510
## 5             3295 Central Park W & W 96 St               40.79127
## 6             3377     Carroll St & Bond St               40.67861
##   start.station.longitude end.station.id  end.station.name end.station.latitude
## 1               -73.98068            168   W 18 St & 6 Ave             40.73971
## 2               -73.96165           3016 Kent Ave & N 7 St             40.72037
## 3               -73.99201            459  W 20 St & 11 Ave             40.74674
## 4               -73.97499            483   E 12 St & 3 Ave             40.73223
## 5               -73.96484           3142   1 Ave & E 62 St             40.76123
## 6               -73.99037           3398   Smith St & 9 St             40.67470
##   end.station.longitude bikeid   usertype birth.year gender
## 1             -73.99456  29807 Subscriber       1994      1
## 2             -73.96165  34411 Subscriber       1974      1
## 3             -74.00776  16078 Subscriber       1961      1
## 4             -73.98890  29904 Subscriber       1964      2
## 5             -73.96094  30247   Customer       1969      0
## 6             -73.99786  20315 Subscriber       1971      1
# Reading in the weather data set
weather_data <- read.csv('NYCWeather2019.csv', stringsAsFactors = TRUE)
head(weather_data)
##       STATION                        NAME     DATE AWND PRCP SNOW SNWD TAVG
## 1 USW00094728 NY CITY CENTRAL PARK, NY US 1/1/2019   NA 0.06    0    0   NA
## 2 USW00094728 NY CITY CENTRAL PARK, NY US 1/2/2019   NA 0.00    0    0   NA
## 3 USW00094728 NY CITY CENTRAL PARK, NY US 1/3/2019   NA 0.00    0    0   NA
## 4 USW00094728 NY CITY CENTRAL PARK, NY US 1/4/2019   NA 0.00    0    0   NA
## 5 USW00094728 NY CITY CENTRAL PARK, NY US 1/5/2019   NA 0.50    0    0   NA
## 6 USW00094728 NY CITY CENTRAL PARK, NY US 1/6/2019   NA 0.00    0    0   NA
##   TMAX TMIN
## 1   58   39
## 2   40   35
## 3   44   37
## 4   47   35
## 5   47   41
## 6   49   31

Initial Data Summary

# Initial summary of rider data set
str(rider_2019_sample)
## 'data.frame':    100000 obs. of  15 variables:
##  $ tripduration           : int  564 1158 763 915 1368 267 661 1112 520 512 ...
##  $ starttime              : Factor w/ 99999 levels "2019-01-01 00:56:30.7720",..: 18803 28405 14066 41002 34169 54789 95279 5247 68397 75686 ...
##  $ stoptime               : Factor w/ 100000 levels "2019-01-01 01:34:45.0200",..: 18804 28409 14065 41001 34174 54787 95282 5246 68395 75682 ...
##  $ start.station.id       : Factor w/ 825 levels "116","119","120",..: 621 86 688 538 263 348 749 80 259 545 ...
##  $ start.station.name     : Factor w/ 894 levels "1 Ave & E 110 St",..: 352 545 760 386 250 234 797 672 440 99 ...
##  $ start.station.latitude : num  40.7 40.7 40.7 40.8 40.8 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 828 levels "116","119","120",..: 15 86 752 774 184 369 623 27 333 509 ...
##  $ end.station.name       : Factor w/ 890 levels "1 Ave & E 110 St",..: 793 549 795 350 7 714 787 371 598 92 ...
##  $ end.station.latitude   : num  40.7 40.7 40.7 40.7 40.8 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : int  29807 34411 16078 29904 30247 20315 40128 33989 29972 20897 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 1 2 1 2 2 2 ...
##  $ birth.year             : int  1994 1974 1961 1964 1969 1971 1969 1960 1972 1966 ...
##  $ gender                 : int  1 1 1 2 0 1 0 1 1 1 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 
# Initial summart of weather data set
str(weather_data)
## 'data.frame':    365 obs. of  10 variables:
##  $ STATION: Factor w/ 1 level "USW00094728": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NAME   : Factor w/ 1 level "NY CITY CENTRAL PARK, NY US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DATE   : Factor w/ 365 levels "1/1/2019","1/10/2019",..: 1 12 23 26 27 28 29 30 31 2 ...
##  $ AWND   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PRCP   : num  0.06 0 0 0 0.5 0 0 0.17 0.06 0 ...
##  $ SNOW   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TAVG   : logi  NA NA NA NA NA NA ...
##  $ TMAX   : int  58 40 44 47 47 49 34 45 45 34 ...
##  $ TMIN   : int  39 35 37 35 41 31 25 34 34 28 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 

Adjusting Dates in Data Sets

# Creating columns of just month, day, and year
weather_data$DATE <- as.Date(weather_data$DATE, format = "%m/%d/%Y")
weather_data$Month <- format(weather_data$DATE, "%m")
weather_data$Day <- format(weather_data$DATE, "%d")
weather_data$Year <- format(weather_data$DATE, "%Y")
# Creating columns of just month, day, and year
rider_2019_sample$DATE <- as.Date(rider_2019_sample$starttime, format = "%Y-%m-%d")
rider_2019_sample$Month <- format(rider_2019_sample$DATE, "%m")
rider_2019_sample$Day <- format(rider_2019_sample$DATE, "%d")
rider_2019_sample$Year <- format(rider_2019_sample$DATE, "%Y")

Rider Age

rider_2019_sample$age <- 2019 - as.numeric(as.character(rider_2019_sample$birth.year))
rider_2019_sample <- filter(rider_2019_sample, age <= 80)

Combining Data Sets

# Combining data frames to compare data
edited_weather <- select(weather_data,
                         PRCP,
                         SNOW,
                         AWND,
                         DATE)
edited_rider <- select(rider_2019_sample, 
                       age,
                       gender,
                       usertype,
                       tripduration,
                       start.station.latitude,
                       start.station.longitude,
                       start.station.id,
                       start.station.name,
                       end.station.latitude,
                       end.station.longitude,
                       end.station.id,
                       end.station.name,
                       DATE,
                       Day,
                       Month,
                       Year)

total_data = merge(edited_weather, edited_rider, by.x="DATE", by.y="DATE", all.x=TRUE)
head(total_data)
##         DATE PRCP SNOW AWND age gender   usertype tripduration
## 1 2019-01-01 0.06    0   NA  52      1 Subscriber         1166
## 2 2019-01-01 0.06    0   NA  33      1 Subscriber          532
## 3 2019-01-01 0.06    0   NA  55      1 Subscriber          263
## 4 2019-01-01 0.06    0   NA  29      1 Subscriber          196
## 5 2019-01-01 0.06    0   NA  28      1 Subscriber          710
## 6 2019-01-01 0.06    0   NA  37      2 Subscriber          312
##   start.station.latitude start.station.longitude start.station.id
## 1               40.72037               -73.96165             3016
## 2               40.67583               -73.95617             3569
## 3               40.74517               -73.98683              474
## 4               40.72308               -73.98584             3656
## 5               40.75187               -73.97771              519
## 6               40.71422               -73.98135              502
##            start.station.name end.station.latitude end.station.longitude
## 1           Kent Ave & N 7 St             40.72080             -73.95485
## 2 Franklin Ave & St Marks Ave             40.69073             -73.95133
## 3             5 Ave & E 29 St             40.74034             -73.98955
## 4           E 2 St & Avenue A             40.72087             -73.98086
## 5       Pershing Square North             40.73222             -73.98166
## 6         Henry St & Grand St             40.72217             -73.98369
##   end.station.id             end.station.name Day Month Year
## 1           3101        N 12 St & Bedford Ave  01    01 2019
## 2           3056 Kosciuszko St & Nostrand Ave  01    01 2019
## 3            402           Broadway & E 22 St  01    01 2019
## 4            150            E 2 St & Avenue C  01    01 2019
## 5            504              1 Ave & E 16 St  01    01 2019
## 6            301            E 2 St & Avenue B  01    01 2019

Distance Between Stations

# Distance between start and end station in Meters
total_data <- mutate(total_data, 
                            distance = distHaversine(cbind(total_data$start.station.longitude,
                                                           total_data$start.station.latitude),
                                                     cbind(total_data$end.station.longitude,
                                                           total_data$end.station.latitude)))
total_data <- filter(total_data, tripduration <= 3600)
head(total_data)
##         DATE PRCP SNOW AWND age gender   usertype tripduration
## 1 2019-01-01 0.06    0   NA  52      1 Subscriber         1166
## 2 2019-01-01 0.06    0   NA  33      1 Subscriber          532
## 3 2019-01-01 0.06    0   NA  55      1 Subscriber          263
## 4 2019-01-01 0.06    0   NA  29      1 Subscriber          196
## 5 2019-01-01 0.06    0   NA  28      1 Subscriber          710
## 6 2019-01-01 0.06    0   NA  37      2 Subscriber          312
##   start.station.latitude start.station.longitude start.station.id
## 1               40.72037               -73.96165             3016
## 2               40.67583               -73.95617             3569
## 3               40.74517               -73.98683              474
## 4               40.72308               -73.98584             3656
## 5               40.75187               -73.97771              519
## 6               40.71422               -73.98135              502
##            start.station.name end.station.latitude end.station.longitude
## 1           Kent Ave & N 7 St             40.72080             -73.95485
## 2 Franklin Ave & St Marks Ave             40.69073             -73.95133
## 3             5 Ave & E 29 St             40.74034             -73.98955
## 4           E 2 St & Avenue A             40.72087             -73.98086
## 5       Pershing Square North             40.73222             -73.98166
## 6         Henry St & Grand St             40.72217             -73.98369
##   end.station.id             end.station.name Day Month Year  distance
## 1           3101        N 12 St & Bedford Ave  01    01 2019  576.0106
## 2           3056 Kosciuszko St & Nostrand Ave  01    01 2019 1707.3540
## 3            402           Broadway & E 22 St  01    01 2019  584.0158
## 4            150            E 2 St & Avenue C  01    01 2019  486.4067
## 5            504              1 Ave & E 16 St  01    01 2019 2213.1388
## 6            301            E 2 St & Avenue B  01    01 2019  907.8033

Initial Data Analysis

Gender Split in Riders

# Reclassifying the genders
# 0=unknown, 1=male, 2=female
total_data$gender <- ifelse(total_data$gender == 0, "Unknown",
                                  ifelse(total_data$gender == 1, "Male", "Female"))

# Seeing the split of genders who rented bikes
total_data %>%
  ggplot(aes(x=gender, fill=gender)) +
  geom_bar() + theme(legend.position="none") +
  ggtitle("Bike Rental Counts by Gender")

Subscriber vs Customer for Riders

# Seeing the split of user type who rented bikes
total_data %>%
  ggplot(aes(x=usertype, fill=usertype)) +
  geom_bar() +
  theme(legend.position="none") + 
  ggtitle("Bike Rental Counts by User Type")

Trip Duration

# Range of all bike rides
trip_duration_stats <- filter(total_data) %>%
# min range of tripduration  
  summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
            duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride
            duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride
            duration_sd = sd(tripduration, na.rm=TRUE)) 
  
trip_duration_stats
##   duration_range_min duration_range_max duration_mean duration_sd
## 1                 61               3599       789.341     587.415

Trip Duration with Rain

# Range of all bike rides affected by rain
total_data_rain <- filter(total_data, SNOW == 0, PRCP > 0) %>%
# min range of tripduration 
    summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
            duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride affected by rain
            duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride affected by rain
            duration_sd = sd(tripduration, na.rm=TRUE)) 

total_data_rain
##   duration_range_min duration_range_max duration_mean duration_sd
## 1                 61               3598      777.5114    575.1325

Trip Duration with Snow

# Range of all bike rides affected by snow
total_data_snow <- filter(total_data, SNOW > 0) %>%
# min range of tripduration
      summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
      duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride affected by snow
      duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride affected by snow
      duration_sd = sd(tripduration, na.rm=TRUE)) 

total_data_snow
##   duration_range_min duration_range_max duration_mean duration_sd
## 1                 62               3548      660.3067    525.4768

Trip Duration with Wind

# Range of all bike rides affected by wind
total_data_wind <- filter(total_data, SNOW == 0, PRCP == 0, AWND > 0) %>% 
# min range of tripduration
      summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
      duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride affected by snow
      duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride affected by snow
      duration_sd = sd(tripduration, na.rm=TRUE)) 

total_data_wind
##   duration_range_min duration_range_max duration_mean duration_sd
## 1                 61               3599      816.5905    601.8395
# Combine above dataframes into one dataframe for side-by-side comparison
dataframe_list = list("Total Data" = trip_duration_stats,
                      "Rain Data" = total_data_rain,
                      "Snow Data" = total_data_snow,
                      "Wind Data" = total_data_wind)
# Can also do rbind(trip_duration_stats, total_data_rain, etc) but this keeps source table names defined above
do.call(rbind, dataframe_list)
##            duration_range_min duration_range_max duration_mean duration_sd
## Total Data                 61               3599      789.3410    587.4150
## Rain Data                  61               3598      777.5114    575.1325
## Snow Data                  62               3548      660.3067    525.4768
## Wind Data                  61               3599      816.5905    601.8395

Types of Weather per Month

# Average rain per month
total_data %>%
  filter(SNOW == 0) %>%
  group_by(Month) %>% 
  summarise(average_rain = mean(PRCP, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    Month average_rain
##    <chr>        <dbl>
##  1 01          0.0790
##  2 02          0.0668
##  3 03          0.0631
##  4 04          0.122 
##  5 05          0.131 
##  6 06          0.149 
##  7 07          0.158 
##  8 08          0.100 
##  9 09          0.0217
## 10 10          0.123 
## 11 11          0.0386
## 12 12          0.170
# Average snow per month
total_data %>% 
  group_by(Month) %>% 
  summarise(average_snow = mean(SNOW, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    Month average_snow
##    <chr>        <dbl>
##  1 01          0.0303
##  2 02          0.0567
##  3 03          0.189 
##  4 04          0     
##  5 05          0     
##  6 06          0     
##  7 07          0     
##  8 08          0     
##  9 09          0     
## 10 10          0     
## 11 11          0     
## 12 12          0.0675
# Average wind speed per month
total_data %>%
  group_by(Month) %>% 
  summarise(average_wind_speed = mean(AWND, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    Month average_wind_speed
##    <chr>              <dbl>
##  1 01                NaN   
##  2 02                NaN   
##  3 03                  4.92
##  4 04                  4.35
##  5 05                  3.73
##  6 06                  4.11
##  7 07                  3.41
##  8 08                  3.85
##  9 09                  4.29
## 10 10                  5.25
## 11 11                  5.30
## 12 12                  6.34
# mean returns NaN if all values in group (ex: jan and feb) are NA

Exploratory Data Analysis - Weather Effects

Average Rain by Age

# Trip duration by age of riders and rain amount
plot_data <- total_data %>%
  filter(SNOW == 0) %>%
  group_by(age) %>%
  summarise(mean_PRCP_by_age = mean(PRCP),
            mean_duration = mean(tripduration)) 
## `summarise()` ungrouping output (override with `.groups` argument)
plot_data %>%
  ggplot(aes(x = age, y = mean_PRCP_by_age)) +
  geom_point(alpha =0.9, shape = 18, colour = "blue", size = plot_data$mean_duration/150) +
  geom_smooth(colour = "orange") 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Average Wind by Age

# Mean Wind by Age of Rider
total_data %>% 
  group_by(age) %>%
  summarise(mean_AWND_by_age = mean(AWND,na.rm = TRUE)) %>%
  ggplot(aes(x = age, y = mean_AWND_by_age)) + geom_line() + geom_smooth() 
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Rain Effects on Trip Duration

# Average ride time when it's raining
total_data %>%
  filter(PRCP > 0, SNOW == 0) %>%
  summarise(prcp_duration_mean = mean(tripduration))
##   prcp_duration_mean
## 1           777.5114
total_data %>% 
  filter(PRCP > 0, SNOW == 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") + 
  geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Snow Effects on Trip Duration

# Average ride time when it's snowing
total_data %>%
  filter(SNOW > 0) %>%
  summarise(snow_duration_mean = mean(tripduration))
##   snow_duration_mean
## 1           660.3067
total_data %>% 
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") + 
  geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Wind Effects on Trip Duration

# Average ride time when it's windy
total_data %>%
  filter(AWND > 0) %>%
  summarise(wind_duration_mean = mean(tripduration))
##   wind_duration_mean
## 1           803.9685
total_data %>% 
  filter(AWND > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") + 
  geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather Effects on Number of Rides over Average Ride Time

# Number of rides over average time sin weather effects
ride_num <- total_data %>%
  filter(tripduration > trip_duration_stats[1,3]) %>%
  count()
ride_num[1,1]
## [1] 37200
# Number of rides over average time with rain
rain_num <- total_data %>%
  filter(SNOW == 0, PRCP > 0, tripduration > trip_duration_stats[1,3]) %>%
  count()
rain_num[1,1]
## [1] 12452
# Number of rides over average time with snow
snow_num <- total_data %>%
  filter(SNOW > 0, tripduration > trip_duration_stats[1,3]) %>%
  count()
snow_num[1,1]
## [1] 503
# Number of rides over average time with wind
wind_num <- total_data %>%
  filter(AWND > 0, tripduration > trip_duration_stats[1,3]) %>%
  count()
wind_num[1,1]
## [1] 34200

Exploratory Data Analysis - Ride History

Speed of Rider Demographics

# Speed of the rider
total_data$speed <- total_data$distance/total_data$tripduration

# Average speed of all riders
all_ride <- total_data %>%
  summarise(average_speed = mean(speed))

# Average speed of young riders
young_ride <- total_data %>%
  filter(age <= 45) %>%
  summarise(young_average = mean(speed))

# Average speed of old riders
old_ride <- total_data %>%
  filter(age >= 65) %>%
  summarise(old_average = mean(speed))

# Average speed of female riders
fem_ride <- total_data %>%
  filter(gender == "Female") %>%
  summarise(female_average = mean(speed))

# Average speed of male riders
male_ride <- total_data %>%
  filter(gender == "Male") %>%
  summarise(male_average = mean(speed))

# Average speed of subscribers
sub_ride <- total_data %>%
  filter(usertype == "Customer") %>%
  summarise(customer_average = mean(speed))

# Average speed of customers
cust_ride <- total_data %>%
  filter(usertype == "Subscriber") %>%
  summarise(subscriber_average = mean(speed))

cbind(all_ride, young_ride, old_ride, fem_ride, male_ride, sub_ride, cust_ride)
##   average_speed young_average old_average female_average male_average
## 1      2.462556      2.538826     2.19201       2.326377     2.572124
##   customer_average subscriber_average
## 1         1.801693           2.565832
# Scatter Plot of speed by age
total_data %>%
  ggplot(aes(x = age, y = speed, colour = gender)) +
  geom_point(alpha = .4, size = 1.5) +
  scale_colour_manual(name = 'Gender',
                      values = setNames(c('blue','magenta', 'dark green'),
                                        c('Male', 'Female', 'Unknown'))) +
  geom_smooth(method='lm', colour = 'black') +
  facet_wrap(~gender)  + # make a plot per gender. look up facet_wrap for other fun ways to do this 
  labs(title="Average Speed of Riders by Age", x="Speed", y="Age")
## `geom_smooth()` using formula 'y ~ x'

# Boxplot of speed by gender
total_data %>%
  ggplot(aes(x = gender, y = speed, colour = gender)) +
  geom_boxplot(outlier.colour = 'red') +
  scale_colour_manual(name = 'Gender',
                      values = setNames(c('blue','magenta', 'dark green'),
                                        c('Male', 'Female', 'Unknown'))) +
  labs(title="Speed of Riders by Gender", x="Gender", y="Speed")

# Boxplot of speed by customer type
total_data %>%
  ggplot(aes(x = usertype, y = speed, colour = usertype)) +
  geom_boxplot(outlier.colour = 'red') +
  scale_colour_manual(name = 'User Type',
                      values = setNames(c('purple', 'orange'),
                                        c('Subscriber', 'Customer'))) +
  labs(title="Speed of Riders by Customer Type", x="Customer Type", y="Speed")

Exploratory Data Analysis - Ride History

Start Locations

top_height <- max(total_data$start.station.latitude) - min(total_data$start.station.latitude)
top_width <- max(total_data$start.station.longitude) - min(total_data$start.station.longitude)
top_borders <- c(bottom  = min(total_data$start.station.latitude)  - 0.1 * top_height,
                 top     = max(total_data$start.station.latitude)  + 0.1 * top_height,
                 left    = min(total_data$start.station.longitude) - 0.2 * top_width,
                 right   = max(total_data$start.station.longitude) + 0.2 * top_width)

start <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/12/1205/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
start_map <- ggmap(start, extent = "device", legend = "topright")

start_map + stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..,), size = 2, bins = 10, data = total_data, geom = "polygon",  na.rm=TRUE,
) + labs( fill = "Density", title = "Start Location Density") + guides(alpha = F)

This graph shows that most bike trips in 2019 start in the center of NYC, with relatively few in the boroughs by comparison.

Start Location Preferences - by Day of Week

# convert dates to weekdays
total_data$day_of_week = weekdays(total_data$DATE)

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) + 
guides(alpha = F) + labs(fill = "Density", title = "Start Location Density by Day of Week") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ day_of_week) + 
theme(legend.position = "right")

From these charts, we can see that the Saturday and Sunday location densities are slightly more spread than the weekdays, suggesting that the weekend trips are less concentrated in the inner city, albeit still focused in Manhattan.

Start Location Preferences - by User Type

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) + 
guides(alpha = F) + labs(fill = "Density", title = "Start Location Density by User Type") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ usertype) + 
theme(legend.position = "right")

This graph shows that there is larger focus on downtown start locations for Subscribers, whereas Customers are spread out along Manhattan and are present in the boroughs as well.

Start Location Preferences - by Trip Duration

## break down by one standard deviation above and below average of trip duration
ggmap(start) +
    geom_point(data = total_data, mapping = aes(x = start.station.longitude, y = start.station.latitude,
                                        col = tripduration)) +
    scale_color_gradient(low = "yellow", high = "red")

End Location Preferences

## before noon and after noon
end <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
end_map <- ggmap(end, extent = "device", legend = "topright")

end_map + stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = total_data,
geom = "polygon"
)

End Location Preferences - by User Type

end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) + 
guides(alpha = F) + labs(fill = "Density", title = "End Location Density by Day of Week") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ day_of_week) + 
theme(legend.position = "right")

We see the same pattern breakdown in the end location density as we do the start locations, suggesting that weekends have a more spread out end locations than the weekdays.

Asymmetrical Traffic Data

total_rides = count(total_data)
test = total_data
test$start.station.name = as.character(test$start.station.name)
test$end.station.name = as.character(test$end.station.name)
test <- test[test$start.station.name==test$end.station.name, ]
same_station = count(test)

symmetric = same_station / total_rides
asymmetric = 1 - symmetric

Only {r} symmetric% of rides start and end at the same station, which means that {r} asymmetric% of rides are asymmetric traffic.

start_popularity = sort(table(total_data$start.station.name), decreasing=TRUE, na.rm=TRUE)
top10 = round(length(unique(total_data$start.station.name, na.rm=TRUE))*0.1)
top_10 = head(start_popularity, top10)
barplot(top_10)

top_starts = as.data.frame(top_10)
top_10rides = sum(top_starts$Freq)

top10_rides = top_10rides / total_rides

{r}top10_rides of bike rides start come from the top 10% most used station (which are:{r} top_10)

count_starts = as.data.frame(table(total_data$start.station.name), na.rm=TRUE)
names(count_starts) = c("station", "starts")
count_ends = as.data.frame(table(total_data$end.station.name), na.rm=TRUE)
names(count_ends) = c("station", "ends")
station_flow = as.data.frame(merge(count_starts, count_ends, by.x="station", by.y="station", all.x=TRUE, na.rm=TRUE))
station_flow$net = station_flow$starts / station_flow$ends

station_flow = na.omit(station_flow)

station_flow %>% mutate(station = fct_reorder(station, net)) %>% ggplot(aes(x=station, y=net)) + geom_bar(stat = "identity")+ geom_hline(yintercept=1, linetype="dashed", color = "red") + labs(x="Stations", y="Total Starts / Total Ends in 2019", title = "Net 2019 Flow of Bikes per Station") + theme(axis.text.x = element_blank())

The chart above depicts each station’s inflow/outflow of bikes in 2019. Those with a value greater than 1 show that they have a higher rate of bikes starting at their station than ending at their station. These stations would be important to target when thinking about rebalancing bikes, as they overall have more bikes leaving them then coming to them. Similarly, those with the lowest start/end ratios have more bikes ending at their station than leaving, making them prime candidates for moving their surplus to a station in more need.

surplus_stations = station_flow[station_flow$net < 0.75,]
deficit_stations = station_flow[station_flow$net > 1.25,]

surplus_stations 
##                                 station starts ends       net
## 22                    12 Ave & W 125 St      2    3 0.6666667
## 42                       23 Ave & 27 St     16   22 0.7272727
## 72                 31 Ave & Crescent St     21   32 0.6562500
## 79                31 St & Northern Blvd     19   27 0.7037037
## 81                       34 Ave & 13 St      5    9 0.5555556
## 84                       34 St & 35 Ave     16   27 0.5925926
## 101                 44 Dr & Jackson Ave     62   89 0.6966292
## 118                       5 St & 51 Ave     19   26 0.7307692
## 120                    5 St & Market St     11   22 0.5000000
## 142 Adam Clayton Powell Blvd & W 118 St     26   40 0.6500000
## 144 Adam Clayton Powell Blvd & W 126 St     27   38 0.7105263
## 151            Amsterdam Ave & W 125 St     35   47 0.7446809
## 169           Battery Pl & Greenwich St      5    9 0.5555556
## 174         Bedford Ave & Montgomery St      6    9 0.6666667
## 215           Bushwick Ave & Forrest St      1    2 0.5000000
## 216            Bushwick Ave & Harman St      1    3 0.3333333
## 234                Carroll St & Bond St     59   81 0.7283951
## 236           Carroll St & Franklin Ave     11   16 0.6875000
## 272              Clinton St & Centre St      9   15 0.6000000
## 282                Columbia St & W 9 St     18   25 0.7200000
## 286          Commerce St & Van Brunt St     16   22 0.7272727
## 305             DeKalb Ave & Hudson Ave    122  164 0.7439024
## 307            DeKalb Ave & Skillman St      3    6 0.5000000
## 311                Ditmars Blvd & 19 St     20   36 0.5555556
## 313            Division Ave & Hooper St      2    4 0.5000000
## 316          Division St & Bowery (old)      5    8 0.6250000
## 325             Dwight St & Van Dyke St     10   15 0.6666667
## 330            E 103 St & Lexington Ave     42   60 0.7000000
## 344                    E 118 St & 1 Ave     30   46 0.6521739
## 346              E 118 St & Madison Ave     20   40 0.5000000
## 375                     E 35 St & 3 Ave      8   12 0.6666667
## 418                     E 71 St & 2 Ave      2    4 0.5000000
## 448                  E 98 St & Park Ave     29   40 0.7250000
## 487                 Garfield Pl & 8 Ave     46   62 0.7419355
## 510                Halsey St & Broadway      1    2 0.5000000
## 515            Harrison Pl & Porter Ave      1    3 0.3333333
## 517               Hart St & Wyckoff Ave     10   16 0.6250000
## 548          Kingsland Ave & Nassau Ave      8   18 0.4444444
## 562          Lafayette St & Jersey St S     23   33 0.6969697
## 568                Lenox Ave & W 115 St     32   66 0.4848485
## 602       Marcus Garvey Blvd & Macon St     26   43 0.6046512
## 628      Morningside Dr & Amsterdam Ave     18   25 0.7200000
## 662             Pearl St & Anchorage Pl     32   45 0.7111111
## 691              Railroad Ave & Kay Ave      4    6 0.6666667
## 693            Richards St & Delavan St     11   18 0.6111111
## 730               Stagg St & Morgan Ave      9   17 0.5294118
## 741           Stewart Ave & Johnson Ave      2    3 0.6666667
## 749        Suydam St & St. Nicholas Ave      2    3 0.6666667
## 750             Throop Ave & Myrtle Ave     14   23 0.6086957
## 758              Union St & Bedford Ave     13   22 0.5909091
## 785              W 129 St & Convent Ave     13   22 0.5909091
## 830                     W 47 St & 6 Ave      2    6 0.3333333
## 886            Wilson Ave & Troutman St      0    4 0.0000000
## 887          Withers St & Kingsland Ave      3    5 0.6000000
## 889           Wyckoff Av & Jefferson St     27   41 0.6585366
deficit_stations
##                                           station starts ends      net
## 6                                  1 Ave & E 5 St      4    1 4.000000
## 13                                10 Hudson Yards     31   18 1.722222
## 20                                 11 St & 35 Ave     11    4 2.750000
## 29                                   2 Ave & 9 St     10    7 1.428571
## 37                                 21 St & 36 Ave      9    7 1.285714
## 38                                 21 St & 38 Ave      7    3 2.333333
## 43                                 24 Ave & 26 St     39   31 1.258065
## 46                                  27 Ave & 3 St      4    1 4.000000
## 48                                  27 Ave & 9 St     20   11 1.818182
## 50                                 28 Ave & 35 St     37   24 1.541667
## 52                                 28 St & 36 Ave     27   21 1.285714
## 62                                3 Ave & E 95 St     35   25 1.400000
## 68                                 30 Ave & 21 St     21   16 1.312500
## 70                                 31 Ave & 30 St     34   25 1.360000
## 82                                 34 Ave & 21 St     22   13 1.692308
## 85                                 35 Ave & 10 St     19   13 1.461538
## 92                                 37 St & 24 Ave     20   11 1.818182
## 95                                   4 Ave & 2 St      9    4 2.250000
## 97                                  40 Ave & 9 St     20   14 1.428571
## 98                           40 Ave & Crescent St     15    4 3.750000
## 100                                 44 Dr & 21 St     67   41 1.634146
## 104                                47 Ave & 31 St     21   16 1.312500
## 110                              5 Ave & E 103 St    140  106 1.320755
## 125                             6 Ave & Spring St     34   15 2.266667
## 141           Adam Clayton Powell Blvd & W 115 St     28   16 1.750000
## 158                      Atlantic Ave & Furman St    160  120 1.333333
## 162                             Avenue D & E 8 St    130   84 1.547619
## 176                          Bedford Ave & S 9 St     44   34 1.294118
## 177                      Bergen St & Flatbush Ave     90   65 1.384615
## 179                    Bergen St & Vanderbilt Ave     87   64 1.359375
## 193                         Broadway & Hancock St      2    1 2.000000
## 197                        Broadway & Roebling St    117   88 1.329545
## 218                    Bushwick Ave & McKibbin St     16    9 1.777778
## 219                      Bushwick Ave & Powers St     43   31 1.387097
## 220                       Bushwick Ave & Stagg St     12    4 3.000000
## 221                          Butler St & Court St     60   46 1.304348
## 225                          Calyer St & Jewel St      3    1 3.000000
## 241                         Cedar St & Myrtle Ave     36   28 1.285714
## 242                          Center Blvd & 48 Ave     51   37 1.378378
## 245                    Central Ave & Flushing Ave     23    7 3.285714
## 246                    Central Ave & Starr Street     31   24 1.291667
## 248 Central Park North & Adam Clayton Powell Blvd    183  140 1.307143
## 265                       Clermont Ave & Park Ave     49   37 1.324324
## 268                    Cliff St & Fulton St (Old)      9    5 1.800000
## 270                      Clinton Ave & Myrtle Ave    130   96 1.354167
## 290                          Court St & Nelson St     44   35 1.257143
## 317                       Dock 72 Way & Market St     20   15 1.333333
## 318                           Douglass St & 3 Ave     77   61 1.262295
## 323                       Duane St & Greenwich St    126   90 1.400000
## 340                              E 114 St & 1 Ave     38   29 1.310345
## 350                      E 123 St & Lexington Ave     37   29 1.275862
## 389                             E 5 St & Avenue C    140  111 1.261261
## 395                       E 53 St & Lexington Ave     24   18 1.333333
## 402                  E 58 St &  1 Ave (NW Corner)    106   84 1.261905
## 431                        E 82 St & East End Ave     78   61 1.278689
## 438                            E 88 St & Park Ave     52   41 1.268293
## 445                               E 95 St & 3 Ave     14   11 1.272727
## 447                         E 97 St & Madison Ave     94   75 1.253333
## 475                         Frost St & Meeker Ave     30   22 1.363636
## 481                         Fulton St & Irving Pl     18   14 1.285714
## 482                       Fulton St & Rockwell Pl     70   54 1.296296
## 489                        Gold St & Frankfort St      6    2 3.000000
## 495              Grand Army Plaza & Plaza St West    135   94 1.436170
## 511                      Halsey St & Tompkins Ave     47   31 1.516129
## 537                           Jackson Ave & 46 Rd     31   24 1.291667
## 541                    Jefferson St & Cypress Ave      2    1 2.000000
## 551                 Knickerbocker Ave & George St     19   10 1.900000
## 554                 Knickerbocker Ave & Thames St     15   10 1.500000
## 557                   Lafayette Ave & Classon Ave     88   69 1.275362
## 559                   Lafayette Ave & St James Pl     99   72 1.375000
## 566                    Lefferts Pl & Franklin Ave     50   39 1.282051
## 595                        Madison Ave & E 120 St     29   21 1.380952
## 601                     Manhattan Av & Leonard St      2    1 2.000000
## 603                     Marcy Ave & Lafayette Ave     34   18 1.888889
## 607                   McKibbin St & Manhattan Ave     38   24 1.583333
## 608                         Melrose St & Broadway      3    1 3.000000
## 609                      Menahan St & Central Ave      3    1 3.000000
## 621                      Monroe St & Tompkins Ave     49   37 1.324324
## 623                  Montgomery St & Franklin Ave     12    8 1.500000
## 637                            N 11 St & Kent Ave     57   43 1.325581
## 644                             Newton Rd & 44 St     20   15 1.333333
## 661                      Park Pl & Vanderbilt Ave     84   66 1.272727
## 666                      Perry St & Greenwich Ave      3    2 1.500000
## 680                          Powers St & Olive St     24   19 1.263158
## 682                     Prospect Park West & 8 St     67   53 1.264151
## 686                Putnam Ave & Knickerbocker Ave      3    2 1.500000
## 702                       Rivington St & Ridge St    176  140 1.257143
## 704                      Rogers Ave & Sterling St     28   22 1.272727
## 707                            S 4 St & Wythe Ave    205  163 1.257669
## 728               St Nicholas Ave & Manhattan Ave     96   76 1.263158
## 731                          Stagg St & Union Ave     76   60 1.266667
## 748                 Suydam St & Knickerbocker Ave     24   19 1.263158
## 776                  W 106 St & Central Park West    117   88 1.329545
## 780                           W 113 St & Broadway     64   51 1.254902
## 782                           W 116 St & Broadway     59   47 1.255319
## 783                              W 12 St & W 4 St     59   41 1.439024
## 813                              W 37 St & 10 Ave    250  198 1.262626
## 819                               W 42 St & 6 Ave      2    0      Inf
## 853                        W 88 St & West End Ave     54   36 1.500000
## 861                          Warren St & Court St     54   36 1.500000
## 869                       Waterbury St & Stagg St     14   11 1.272727
## 879                      Willoughby Ave & Hall St    107   73 1.465753
## 880                   Willoughby Ave & Myrtle Ave      8    3 2.666667
## 888                        Wolcott St & Dwight St     18   10 1.800000

Stations with surplus:
{r} surplus_stations

Stations with deficit: {r} deficit_stations